library(data.table)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(stats)
library(tidyverse)
## ─ Attaching packages ──────────────────────────── tidyverse 1.3.0 ─
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ─ Conflicts ───────────────────────────── tidyverse_conflicts() ─
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
# Avoid scientific notation and limit printed precision.
options("scipen" = 100, "digits" = 4)

# All four input CSVs live in the same directory; build the path once so
# the location only has to change in one place (the original repeated the
# full absolute path four times).
data_dir <- file.path(
  "/Users/robinho/Nutstore Files/01 ESADE/Study",
  "05 MSc Business Analytics 19_20/04 Term2/01 R language",
  "Data Analytics with R/Challenge/Challenge 2/DATA"
)

machine_data       <- fread(file.path(data_dir, "machine_data.csv"))
product_data       <- fread(file.path(data_dir, "product_data.csv"))
transactional_data <- fread(file.path(data_dir, "transactional_data.csv"))
machine_failures   <- fread(file.path(data_dir, "machine_failures.csv"))
1. Merge the transactional dataset with the machine failures dataset, setting the failure variable to 0 when no failure is recorded.
# 1-1. Merge machine_failures with the transactional data.
# Left join (all.x = TRUE): keep every transaction; transactions without a
# matching failure record get NA in `failure` after the merge.
complete_data <- merge(
  transactional_data,
  machine_failures,
  by = c("machine", "timestamp", "column"),
  all.x = TRUE  # TRUE, not T: T is a reassignable binding, not a keyword
)

# 1-2. A sale with no matching failure record means no failure was
# recorded, so recode the NAs produced by the left join to 0.
complete_data[is.na(failure), failure := 0]
complete_data
2. In the transactional data table, create a variable called last_vend containing the timestamp of the previous sale of each machine.
# 2-1. Order the table by machine and time.
# BUG FIX: the original `complete_data[order(machine, timestamp)]` built an
# ordered copy and discarded it (it only printed the result) — the table
# itself was never reordered, so the subsequent shift() relied on the merge
# happening to return rows in the right order. setorder() sorts the
# data.table by reference.
setorder(complete_data, machine, timestamp)

# 2-2. last_vend = timestamp of the previous sale of the same machine
# (NA for each machine's first recorded sale, from shift()'s default fill).
complete_data[, last_vend := shift(timestamp, 1), by = machine]
complete_data
3. Create a new variable in the transactional data table called deltahours containing, for every sale, the hours that passed since the last sale.
# 3. Hours elapsed since the machine's previous sale.
# NA for the first sale of each machine, because last_vend is NA there.
complete_data[
  ,
  `:=`(deltahours = difftime(timestamp, last_vend, units = "hours"))
]
complete_data
4. Create an auxiliary data table called machine_daily_average with the average daily sales per machine. Use this auxiliary table to attach to every row of the transactional data table the average daily sales per machine. You can do this by doing a merge.
# 4-1. Number of sales per machine.
# Comment: days on which the machine had a failure still count as active
# days for the purpose of computing transactions per machine.
num_prod <- complete_data[, .(num_prod = .N), by = "machine"]
head(num_prod)

# 4-2. Number of distinct days on which each machine was working.
working_days <- complete_data[, .(num_days = uniqueN(date)), by = "machine"]
head(working_days)

# 4-3. Average daily sales per machine.
# BUG FIX: the original computed `num_prod$num_prod / num_days` inside a
# query on working_days, relying on the two aggregates having identical row
# order. That positional alignment is coincidental; join the two tables on
# `machine` so the division is keyed, not positional.
machine_daily_average <- merge(num_prod, working_days, by = "machine")[
  ,
  .(daily_sales_machine = num_prod / num_days, machine)
]

# 4-4. Attach the per-machine daily average to every transaction row.
complete_data <- merge(
  complete_data,
  machine_daily_average,
  by = "machine"
)
complete_data
5. Create a new variable called delta in the transactional data table containing a normalized version of deltahours consisting on the deltahours associated with each sale divided by the average deltahours of each machine i.e. delta = deltahours /(24/daily_sales_machine). The interpretation of delta is the amount of missed sales if the machine was selling at a constant rate.
# 5. Normalised inter-sale gap. The machine's average gap between sales is
# 24 / daily_sales_machine hours; delta expresses deltahours in units of
# that average gap, i.e. the number of sales "missed" had the machine been
# selling at a constant rate.
complete_data[
  ,
  delta := as.numeric(deltahours) / (24 / daily_sales_machine)
]
complete_data
6. Select 30% of the machines in the transactional data for testing and 70% of the machines for training and train a linear logistic regression model called m to predict whether a machine has a failure as a function of variable delta.
# 6-1. All distinct machines.
machine_list <- unique(complete_data$machine)

# 6-2. Split by *machine* (not by row) so every transaction of a given
# machine lands in exactly one partition: 70% of machines for training.
set.seed(5)
training_machine_list <- sample(
  machine_list,
  round(0.7 * length(machine_list), 0),
  replace = FALSE  # FALSE, not F: F is a reassignable binding
)
train <- complete_data[machine %in% training_machine_list]

# 6-3. The remaining 30% of machines form the test set.
test_machine_list <- setdiff(machine_list, training_machine_list)
test <- complete_data[machine %in% test_machine_list]

# 6-4. Logistic regression: P(failure) as a function of delta.
m <- glm(failure ~ delta, data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# 6-5. Model summary.
summary(m)
##
## Call:
## glm(formula = failure ~ delta, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.037 -0.062 -0.048 -0.045 3.718
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.91221 0.02307 -300 <0.0000000000000002 ***
## delta 0.56273 0.00314 179 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 103916 on 1287048 degrees of freedom
## Residual deviance: 58491 on 1287047 degrees of freedom
## (1746 observations deleted due to missingness)
## AIC: 58495
##
## Number of Fisher Scoring iterations: 9
The estimated coefficient for delta is 0.56273.